from IPython.core.display import HTML
# Notebook helper cell: injects a JS button that shows/hides every code
# cell (jQuery is provided by the classic Jupyter frontend).  The page
# starts with code hidden because code_toggle() runs on document ready.
display(HTML("""
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>"""))
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import matplotlib as mpl
import dateparser
import datetime
import math
import folium
import requests
import json
import calmap
import scipy.cluster.hierarchy as sch
import glob
import warnings
import gzip
import datetime as dt
import matplotlib.cm as cm
import matplotlib.font_manager
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from scipy.spatial.distance import euclidean, cityblock
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
from geopy.geocoders import Nominatim
from collections import Counter
from IPython.display import Image, display
# Silence pandas FutureWarnings and SettingWithCopyWarning notebook-wide
# to keep the rendered output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None
def plot_traffic():
    """Plot the average hourly traffic level, weekdays vs weekends.

    Reads a 50k-row sample of 'waze.csv', expands each jam into its
    coordinate blocks, and plots mean traffic level per hour of day for
    weekdays and weekends on one axis.
    """
    import ast  # stdlib; safe literal parsing instead of eval
    data = pd.read_csv('waze.csv', nrows=50_000)
    # 'blocks' is stored as the string repr of a list of coordinate
    # tuples; literal_eval parses it without eval's code-execution risk.
    data['blocks'] = data['blocks'].map(ast.literal_eval)
    data = data.explode('blocks')
    data['startTime'] = pd.to_datetime(data.startTime.str[:-10],
                                       format='%Y-%m-%d %H')
    data['startTime'] += dt.timedelta(hours=8)  # UTC -> PHT (UTC+8)
    data['hour'] = data.startTime.dt.hour
    data['dayofweek'] = data.startTime.dt.dayofweek
    # Saturday (5) and Sunday (6) count as weekend.
    data['weekend'] = np.where(data.dayofweek >= 5, 1, 0)
    # Level 5 marks road closures, not traffic; exclude it.
    data_2 = data[data['level'] < 5]
    # Select 'level' before mean() so non-numeric columns (e.g. the
    # block tuples) are never aggregated.
    df_level_day = (data_2[data_2.weekend == 0]
                    .groupby('hour')[['level']].mean())
    df_level_end = (data_2[data_2.weekend == 1]
                    .groupby('hour')[['level']].mean())
    dist_level = pd.concat([df_level_day,
                            df_level_end], axis=1).fillna(value=0)
    dist_level.columns = ['weekday', 'weekend']
    dist_level.plot(figsize=(15, 7), xlabel='Hour',
                    ylabel='Ave. Traffic Level',
                    title='Ave. Traffic Level per Hour in Pasig City')
def scrape_weather_wwo():
    """Scrape hourly historical weather for Quezon City from the World
    Weather Online past-weather API.

    Writes one raw JSON file per requested month range and accumulates
    all hourly records into 'WWO_weather_final_hr.csv'.
    """
    # Month start/end boundaries to request, Jan 2018 - Aug 2021.
    dates = pd.date_range('2018-01-01', '2021-08-30',
                          freq='1M') - pd.offsets.MonthBegin(1)
    enddates = pd.date_range('2018-02-28', '2021-09-02',
                             freq='1M') - pd.offsets.MonthEnd(1)
    enddate_list = enddates.strftime("%Y-%m-%d").tolist()
    startdate_list = dates.strftime("%Y-%m-%d").tolist()
    enddate_list.append('2021-08-31')
    startdate_list.append('2021-08-01')
    # NOTE: hard-coded API key; move to an environment variable before
    # sharing this notebook.
    API_KEY = '4b69c33e21fc4877a58102017210209'
    # output column -> key in the API's hourly payload
    hourly_fields = {'time': 'time',
                     'tempc': 'tempC',
                     'precipmm': 'precipMM',
                     'humidity': 'humidity',
                     'windspeedkmph': 'windspeedKmph',
                     'visibility': 'visibility',
                     'pressure': 'pressure',
                     'cloudcover': 'cloudcover',
                     'heatindexC': 'HeatIndexC',
                     'dewpointC': 'DewPointC',
                     'windchillC': 'WindChillC',
                     'windgustkmph': 'WindGustKmph',
                     'uvindex': 'uvIndex'}
    rows = []
    for startmonth, endmonth in zip(startdate_list, enddate_list):
        data = requests.get(
            'http://api.worldweatheronline.com/premium/v1/past-weather.ashx',
            params={
                'key': API_KEY,
                'q': 'Quezon City',
                'date': startmonth,
                'enddate': endmonth,
                'tp': 1,            # hourly granularity
                'data': 'weather',
                'format': 'json'}).json()
        # Keep the raw month response on disk for reproducibility.
        filename = startmonth + "_" + endmonth + "_" + "hr" ".json"
        with open(filename, 'w') as f:
            # BUG FIX: original called json.dump(res, f) but `res` was
            # never defined (NameError); the parsed response is `data`.
            json.dump(data, f)
        for day in data['data']['weather']:
            for hourly in day['hourly']:
                row = {'date': day['date']}
                row.update({col: hourly[key]
                            for col, key in hourly_fields.items()})
                rows.append(row)
    # Build the frame once instead of concatenating one row at a time
    # (the original pattern is quadratic in the number of rows).
    df = pd.DataFrame(rows)
    df.to_csv('WWO_weather_final_hr.csv')
def merge_weather_df():
    """Merge the DataFrames for each weather dataset"""
    # Hourly World Weather Online scrape (output of scrape_weather_wwo).
    wwo = pd.read_csv('../data/World Weather Online/WWO_weather_final_hr.csv')
    time = wwo['time'].values
    time_new = []
    # 'time' is stored as 0, 100, ..., 2300; strip the trailing "00"
    # minutes to recover the hour-of-day integer.
    for i in time:
        if i == 0:
            time_new += [0]
        else:
            time_new += [int(str(i)[:-2])]
    wwo['time_new'] = time_new
    wwo['datetime'] = pd.to_datetime(wwo.date) + pd.to_timedelta(wwo.time_new,
                                                                 unit='h')
    wwo.index = wwo['datetime']
    date_index2 = pd.date_range(start='2019-04-01',
                                end='2021-07-31', closed=None, freq='H')
    # NOTE(review): wwo_new (reindexed + forward-filled) is built but
    # never used below — wwo_final is derived from the raw `wwo`
    # instead.  Confirm whether wwo_new was meant to feed the merge.
    wwo_new = wwo.reindex(date_index2).copy()
    wwo_new.fillna(method="ffill", inplace=True)
    # Timeanddate scrape (6-hour granularity).
    time_date = pd.read_csv('/mnt/processed/private/msds2022/lt12/'
                            'DMW Final Project/weather_pasig.csv')
    dates2 = [i.split(',')[0].strip() for i in list(time_date.ds.values)]
    time_date['dates_day'] = dates2
    time_date.dates_day = time_date.dates_day.map(dateparser.parse)
    time_ = list(time_date['ts'])
    time_new_ = []
    # NOTE(review): time_new_ is computed here but never used afterwards.
    for i in time_:
        time_new_ += [int(i[:2])]
    date_index2 = pd.date_range(start='2010-01-01', end='2021-08-26',
                                closed=None, freq='H')
    # NOTE(review): time_date still has a default integer index at this
    # point, so reindexing it on a DatetimeIndex produces all-new rows —
    # verify this interpolation step does what the report describes.
    time_date_new = time_date.reindex(date_index2).copy()
    time_date_new.fillna(method="ffill", inplace=True)
    time_date_new['datetime'] = time_date_new.index
    wwo_final = wwo.reset_index(level=0, drop=True).copy()
    time_date_final = time_date_new.reset_index(level=0, drop=True).copy()
    # Inner-join the two weather sources on the hourly timestamp column.
    df_weather = wwo_final.merge(time_date_final,
                                 how='inner',
                                 on='datetime').copy()
    return df_weather
def plot_hist_rain():
    """Plot overlaid histograms of adjusted daily rainfall (mm), one per
    year from 2018 to 2021.

    Reads 'df_weather.csv', sums the adjusted hourly precipitation per
    day, and overlays a 50-bin histogram for each year.
    """
    df_weather2 = pd.read_csv('df_weather.csv')
    df_day_precip_std = df_weather2.groupby('date')['precipmm_std'].sum()
    plt.figure(figsize=(15, 8))
    df_day_precip_std.index = pd.to_datetime(df_day_precip_std.index)
    # One overlaid histogram per year (loop replaces four copy-pasted
    # calls in the original).
    for year in (2018, 2019, 2020, 2021):
        df_day_precip_std[df_day_precip_std.index.year == year].hist(
            alpha=0.4, bins=50, label=year)
    plt.title('Histogram of daily rainfall in mm', fontsize=20)
    plt.xlim(0, 70)
    plt.legend()
def bucket_rain(mm):
    """Map an hourly rainfall amount in mm to an intensity label.

    Returns 'dry' for exactly 0, 'drizzle' below 2.5 mm, 'rain' below
    7.5 mm, and 'heavyrain' at 7.5 mm or more.  A NaN input matches no
    branch and yields None, mirroring the original behavior.
    """
    if mm == 0:
        return 'dry'
    # Ordered upper bounds: the first bound the value falls under wins.
    for bound, label in ((2.5, 'drizzle'), (7.5, 'rain')):
        if mm < bound:
            return label
    if mm >= 7.5:
        return 'heavyrain'
def plot_rain_classification():
    """Plot the yearly frequency of each daily rain classification.

    Each day is labeled by the most intense classification reached in
    any of its hours, then counts are shown per year as a bar plot.
    """
    df_weather2 = pd.read_csv('df_weather.csv')
    df_weather2['precipmm_std'] = df_weather2['precipmm_std'].astype(float)
    df_weather2['rain_classification_std'] = (
        df_weather2['precipmm_std'].apply(bucket_rain))
    # max() over string labels picks the lexicographic maximum per day.
    daily = (df_weather2
             .groupby('date')['rain_classification_std']
             .max()
             .reset_index())
    daily['year'] = daily['date'].astype(str).str[:4]
    sns.set(font_scale=1.5)
    sns.catplot(x="year", hue="rain_classification_std",
                data=daily, kind="count", height=8,
                aspect=15 / 8).set(title='Frequency per rain '
                                         'classification')
def plot_calendar_rain():
    """Render total daily rainfall as a calendar heatmap (one strip per
    year) using calmap."""
    daily_rain = (pd.read_csv('df_weather.csv')
                  .groupby('date')['precipmm_std'].sum())
    daily_rain.index = pd.to_datetime(daily_rain.index)
    fig, ax = calmap.calendarplot(daily_rain,
                                  fig_kws={"figsize": (36, 15)},
                                  fillcolor='grey', cmap='Blues')
    fig.suptitle("Total Rainfall Daily", y=1.01, weight='bold',
                 fontsize=25)
mpl.rcParams['font.family'] = 'DejaVu Sans'
def plot1(Z):
    """Plot a truncated dendrogram for a linkage matrix.

    Parameters
    ----------
    Z : ndarray
        Output of scipy.cluster.hierarchy.linkage.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axes the dendrogram was drawn on.
    """
    fig, ax = plt.subplots(figsize=(12, 4), dpi=300)
    # Show only the top 5 merge levels so the tree stays readable.
    # (The original bound the result to a local named `dendrogram`,
    # shadowing the scipy function imported at module level; the return
    # value is unused, so no binding is needed.)
    sch.dendrogram(Z, truncate_mode='level', p=5)
    plt.title('Traffic Clusters')
    plt.xlabel('')
    plt.ylabel(r'$\Delta$')
    plt.show()
    return ax
def convert_to_blocks(filepaths):
    """Clean Waze jam dumps and vectorize linestrings into square blocks.

    Reads newline-delimited JSON in chunks, converts each unique jam
    linestring into a deduplicated list of coordinate blocks rounded to
    3 decimal degrees (~170 m per side), and appends the cleaned rows to
    'waze.csv.gz'.

    Parameters
    ----------
    filepaths : iterable of str
        Paths to line-oriented JSON dumps of Waze jams.
    """
    # BUG FIX: the original used shapely.geometry without importing
    # shapely anywhere in the file (NameError at runtime).  Import
    # locally so the rest of the notebook works without shapely.
    import ast
    import shapely.geometry
    # Cache mapping linestring repr -> block list so each unique jam is
    # vectorized only once across chunks and files.
    jams_lookup = pd.Series(dtype=object)
    for filepath in filepaths:
        print('')
        print(filepath)
        # chunksize must be an int (was the float 2E5).
        for df in pd.read_json(filepath,
                               orient='records',
                               lines=True,
                               chunksize=200_000):
            print('.', end='')  # progress tick per chunk
            df = df.drop(['type',
                          'startTimeMillis',
                          'uuid', 'street',
                          'city'], axis=1)
            df['line'] = df.line.map(str)
            jams = df[['line', 'length']].drop_duplicates()
            # Only vectorize jams we have not already seen.
            jams = jams[~jams.line.isin(jams_lookup.index)]
            jams.index = jams['line'].values.copy()
            # literal_eval instead of eval: same parse for the
            # list-of-dict reprs, without arbitrary code execution.
            jams['line'] = jams.line.map(ast.literal_eval).map(
                lambda line: shapely.geometry.linestring.LineString(
                    [list(pt.values()) for pt in line]))
            # Sample points along the line (denser for longer jams)...
            jams['line'] = jams.apply(lambda jam: [
                jam.line.interpolate(dist).xy for dist in np.linspace(
                    0, 1, jam.length//4+2)], axis=1)
            # ...then round to 3 decimal degrees and deduplicate to get
            # the set of blocks the jam touches.
            jams['blocks'] = jams['line'].map(
                lambda line: list(set(
                    [tuple(np.round(pt, 3).flatten()) for pt in line])))
            # BUG FIX: Series.append was removed in pandas 2.0; concat
            # is the supported equivalent.
            jams_lookup = pd.concat([jams_lookup, jams['blocks']])
            df = df.join(jams_lookup.to_frame(name='blocks'), on='line')
            del df['line']
            df['startTime'] = df.startTime.str[:-7]
            # speed == 0 marks road closures; drop them and convert the
            # remaining speeds from m/s to km/h.
            df = df[df.speed > 0]
            df['speed'] = df['speed']*3.6
            df.to_csv('waze.csv.gz',
                      compression='gzip',
                      index=False, mode='a', header=False)
def waze_dataset(nrows):
    """Reads the csv file 'waze.csv.gz' and adds datetime features

    Parameters
    ----------
    nrows : int
        Total # of rows to get in the dataset

    Returns
    -------
    df : pd.DataFrame
        Returns a pandas DataFrame of the waze dataset with datetime
        features (date, hour, dayofweek, weekend flag, year, month, day)
    """
    import ast  # stdlib; safe literal parsing instead of eval
    df = pd.read_csv('waze.csv.gz', nrows=nrows)
    # 'blocks' holds the string repr of a list of coordinate tuples;
    # literal_eval parses it without eval's code-execution risk.
    df['blocks'] = df.blocks.map(ast.literal_eval)
    df = df.explode('blocks')
    df['startTime'] = pd.to_datetime(df.startTime.str[:-10],
                                     format='%Y-%m-%d %H')
    df['startTime'] += datetime.timedelta(hours=8)  # UTC -> PHT (UTC+8)
    df['date'] = df.startTime.dt.date
    df['hour'] = df.startTime.dt.hour
    df['dayofweek'] = df.startTime.dt.dayofweek
    # Saturday (5) and Sunday (6) count as weekend.
    df['weekend'] = np.where(df.dayofweek >= 5, 1, 0)
    # Use the dt accessor directly instead of rebuilding a DatetimeIndex
    # from the date column three times.
    df['year'] = df.startTime.dt.year
    df['month'] = df.startTime.dt.month
    df['day'] = df.startTime.dt.day
    return df
def merge_waze_weather(df_waze):
    """Merges the waze and weather dataset

    Parameters
    ----------
    df_waze : pd.DataFrame
        The waze dataset

    Returns
    -------
    merged_df : pd.DataFrame
        Returns the merged waze and weather dataset with rain_bucket,
        level_squared and year-month columns added
    """
    df_weather = pd.read_csv('df_weather.csv')
    df_weather['rain_bucket'] = df_weather.precipmm.map(bucket_rain)
    merged_df = df_waze.merge(df_weather, on=['year', 'month', 'day'])
    merged_df['level_squared'] = merged_df.level ** 2
    # Zero-pad the month so lexical sort equals chronological sort.
    merged_df['month'] = merged_df['month'].apply(lambda m: f'{m:0>2}')
    merged_df['year-month'] = (merged_df['year'].astype(str)
                               + '-'
                               + merged_df['month'].astype(str))
    return merged_df
def bucket_rain(mm):
    """Assign rain classification"""
    # NOTE(review): exact duplicate of the bucket_rain defined earlier
    # in this file; being defined later, this copy is the one in effect
    # at runtime.  Consider removing one of the two.
    # Thresholds (mm/hour): 0 = dry, <2.5 = drizzle, <7.5 = rain,
    # >=7.5 = heavyrain.  A NaN input matches no branch -> returns None.
    if mm == 0:
        return 'dry'
    if mm < 2.5:
        return 'drizzle'
    if mm < 7.5:
        return 'rain'
    if mm >= 7.5:
        return 'heavyrain'
def reverse_geo(df):
    """Reverse-geocode block coordinates to road names via Nominatim.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'blocks' column of (lon, lat) tuples, or
        precomputed 'lat'/'lon'/'coor' columns.

    Returns
    -------
    road_df : pd.DataFrame
        Columns 'coor' and 'road' for every coordinate Nominatim could
        resolve to a road.
    """
    # BUG FIX: the original guard was `~any(...)`; `~` on a Python bool
    # yields -2 or -1 — both truthy — so the branch always ran and
    # clobbered existing lat/lon columns.  `not` is the boolean negation.
    if not df.columns.isin(['lat', 'lon']).any():
        df['lat'] = df.blocks.str[1]
        df['lon'] = df.blocks.str[0]
        df['coor'] = df.lat.astype(str) + ', ' + df.lon.astype(str)
    # Hoisted out of the loop: one geocoder client for all requests.
    locator = Nominatim(user_agent='myGeocoder')
    road = []
    for coor in df.coor:
        location = locator.reverse(coor)
        address = location.raw['address']
        if 'road' in address:
            road.append((coor, address['road']))
    road_df = pd.DataFrame(road, columns=['coor', 'road'])
    return road_df
def seasonality_plot(merged_df, rainbucket=False):
    """Plots the seasonality trend of the Weather in Pasig City

    Parameters
    ----------
    merged_df : pd.DataFrame
        Merged waze and weather dataset
    rainbucket : bool
        If True, plot the monthly counts per rain bucket; if False,
        plot the monthly counts per weather description.

    Returns
    -------
    Displays the seasonality trend of the weather in Pasig City
    """
    # The two branches of the original differed only in the grouping /
    # pivot column, so parameterize it instead of duplicating the code.
    # (The original also bound the pivot to a local named
    # `seasonality_plot`, shadowing this function's own name.)
    group_cols = ['date', 'year-month', 'desc']
    if rainbucket:
        group_cols.append('rain_bucket')
    df = merged_df.groupby(group_cols, as_index=False)['level'].count()
    df = df.drop(['date', 'level'], axis=1)
    seasonality = df.value_counts() \
                    .reset_index() \
                    .rename(columns={0: '_count'})
    pivot_col = 'rain_bucket' if rainbucket else 'desc'
    pivot = seasonality.pivot_table(index='year-month',
                                    columns=pivot_col,
                                    values='_count',
                                    fill_value=0)
    pivot.plot(
        figsize=(15, 7), xlabel='Month', ylabel='Count',
        title='Seasonality Trend of Weather in Pasig City')
    plt.show()
def hourly_heatmap(merged_df):
    """Heatmap of hourly traffic level for the Top 25 roads in Pasig
    City.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Merged waze and weather dataset

    Returns
    -------
    Displays a seaborn heatmap of traffic level by hour for the 25 most
    frequently jammed blocks, labeled by reverse-geocoded road name.
    """
    subset = merged_df[['blocks', 'hour', 'level', 'level_squared']].copy()
    heat_df = subset.groupby(['blocks', 'hour'], as_index=False).mean()
    # Ratio of mean squared level to mean level weights heavier jams
    # more strongly than a plain average would.
    heat_df['new_level'] = heat_df['level_squared'] / heat_df['level']
    heat_df = heat_df.drop(['level', 'level_squared'], axis=1)
    heat_df['lat'] = heat_df.blocks.str[1]
    heat_df['lon'] = heat_df.blocks.str[0]
    heat_df['coor'] = heat_df.lat.astype(str) + ', ' + heat_df.lon.astype(str)
    # Pick the 25 blocks that appear most often (ties broken by block).
    counts = Counter(heat_df.blocks)
    top25 = [block for block, _ in sorted(counts.items(),
                                          key=lambda kv: (kv[1], kv[0]),
                                          reverse=True)[:25]]
    final_df = heat_df[heat_df.blocks.isin(top25)]
    roads = reverse_geo(final_df).copy()
    heatmap_df = final_df.merge(roads.copy(), on='coor')
    heat_pivot = heatmap_df.pivot_table(index='road', columns='hour',
                                        values='new_level', fill_value=0)
    fig, ax = plt.subplots(figsize=(15, 7))
    sns.heatmap(heat_pivot, cmap="YlOrBr")
    plt.title('Traffic Level by hour in Pasig City')
    plt.show()
def geo_plot(merged_df, plot_type):
    """Makes a geographical scatter or heatmap of the roads that are
    affected by traffic due to heavyrain

    Parameters
    ----------
    merged_df : pd.DataFrame
        Merged waze and weather dataset
    plot_type : str
        Either 'Scatter' or 'Heatmap'

    Returns
    -------
    imap : folium.Map
        Geographical scatter plot or heatmap of the roads affected by
        traffic during heavy rain

    Raises
    ------
    ValueError
        If plot_type is neither 'Scatter' nor 'Heatmap'.
    """
    heavy_rain = merged_df[merged_df.rain_bucket == 'heavyrain']
    heavyrain_df = heavy_rain.groupby('blocks', as_index=False)[
        ['speed', 'delay', 'level']].mean().sort_values(
        by=['level', 'delay'], ascending=[False, False])
    heavyrain_df.level = round(heavyrain_df.level).astype(int)
    # Level 5 marks road closures, not congestion; drop it.
    heavyrain_df = heavyrain_df[heavyrain_df.level != 5]
    heavyrain_df['lat'] = heavyrain_df.blocks.str[1]
    heavyrain_df['lon'] = heavyrain_df.blocks.str[0]
    heavyrain_df = heavyrain_df.drop('blocks', axis=1)
    colors = {
        0: 'green',
        1: 'yellow',
        2: 'orange',
        3: 'red',
        4: 'darkred'
    }
    heavyrain_df['color'] = heavyrain_df.level.map(colors)
    imap = folium.Map(location=[
        heavyrain_df.lat.mean(), heavyrain_df.lon.mean()], zoom_start=10)
    if plot_type == 'Scatter':
        # One toggleable layer per traffic level 1-4, label tinted to
        # match the markers.  (The original rebound the names `level`
        # and `color` inside the loops, shadowing the list/dict they
        # were derived from — build each group in the loop instead.)
        for lvl in (1, 2, 3, 4):
            group = folium.FeatureGroup(
                name='<span style="color: {col};">{txt}</span>'.format(
                    txt=f'Level{lvl}', col=colors[lvl]))
            lvl_df = heavyrain_df[heavyrain_df.level == lvl]
            for row in lvl_df.itertuples():
                group.add_child(folium.Circle(location=[row.lat, row.lon],
                                              radius=50, weight=3,
                                              opacity=row.level,
                                              color=row.color, fill=True))
            imap.add_child(group)
        imap.add_child(folium.map.LayerControl())
        return imap
    elif plot_type == 'Heatmap':
        heat_data = [[lat, lon] for lat, lon in zip(heavyrain_df['lat'],
                                                    heavyrain_df['lon'])]
        HeatMap(heat_data).add_to(imap)
        return imap
    else:
        raise ValueError('plot_type should either be Scatter or Heatmap')
def mapper_dry_vs_rainy(feature_compare='jam_speed_mean',
                        feature_plot='change_due_to_rain'):
    """Plot out features on a map"""
    # Per-block engineered features and the matching block coordinates.
    features = pd.read_csv('features_weather_new.csv.gz')
    blocks = pd.read_csv('blocks_before_screening.csv.gz')
    blocks['dry'] = features[f'{feature_compare}_rain:dry']
    blocks['drizzle'] = features[f'{feature_compare}_rain:drizzle']
    blocks['heavy_rain'] = features[f'{feature_compare}_rain:heavyrain']
    # Normalize each regime by its own mean so the difference measures
    # relative change due to rain rather than absolute magnitude.
    blocks['rain_norm'] = blocks.heavy_rain/blocks.heavy_rain.mean()
    blocks['dry_norm'] = blocks.dry/blocks.dry.mean()
    blocks['change_due_to_rain'] = blocks.rain_norm - blocks.dry_norm
    # Map the plotted feature onto a diverging red-yellow-blue scale.
    norm = mpl.colors.Normalize(vmin=blocks[feature_plot].min(),
                                vmax=blocks[feature_plot].max())
    m = cm.ScalarMappable(norm=norm, cmap=cm.RdYlBu_r)
    imap = folium.Map(location=[
        blocks.lat.mean(), blocks.lon.mean()], zoom_start=14)
    # One colored square marker per block with a defined rain delta.
    for ix, block in blocks.iterrows():
        if not pd.isnull(block['change_due_to_rain']):
            # NOTE(review): the embedded SVG markup looks malformed
            # (comma-separated attributes, unclosed <rect>); browsers
            # render it leniently, but confirm the intended markup.
            imap.add_child(
                folium.Marker(
                    location=[block['lat'], block['lon']],
                    icon = (folium.DivIcon(
                        html=f"""<div><svg><rect width="10" height="10",
                        opacity=".8",
                        fill="{mpl.colors.rgb2hex(m.to_rgba(
                            block[feature_plot])[:3])}",
                        stroke="black", "stroke-width"="1"</svg></div>"""))))
    return imap
Image('Pasig City_.jpg')
The Philippines is one of the countries where autos are widely used as a form of travel. With roughly 154 thousand passenger cars sold in 2020, the Philippines is placed 11th among Asia and Pacific countries in terms of passenger car sales. With the increasing number of cars sold in the Philippines and the country's climate change, traffic is unavoidable and will increase over time. We try to address how the traffic flow in different areas of Pasig City responds to wet weather using data from the Waze Connected Citizens Program and weather data.
To help us investigate our problem, we examined both the weather and traffic data of Pasig City. For the weather data, we scraped both the TimeandDate weather API and WorldWeatherOnline website to derive weather features. On the other hand, we obtained externally-sourced Waze data to extract transport-related features such as traffic and location with the help of Thinking Machines and the Waze Connected Citizens Program. To aid our analysis, we used vector representation to segment the Pasig City Waze traffic into 700 square blocks with derived traffic features (average speed, number of traffic jams, portions of road). Feature engineering was done to derive additional features such as the amount of rainfall in a certain span of a day/week/month/year to examine the effect of weather features for each block at different times. Each block was then grouped through the help of clustering. Additionally, dimensionality reduction was done to extract significant features.
Upon clustering we were able to identify 3 main clusters according to their traffic volume and road type, and we decided to label them as routes:
Cluster 1: Major Highways includes roads that guarantee the fastest route with the highest potential speed; however, because of the high volume of traffic and the route's length, most people will be significantly delayed along this route. This group includes major roadways. The Pasig City LGU and MMDA need to improve infrastructural capacity and traffic control in these locations, especially during severe weather.
Cluster 2: Intracity Routes contains roads that are ranked second in terms of weather-related delays, congestion, and speed. The majority of Pasig's areas fall into this category which indicates high urban congestion within the roads.
Cluster 3: Hidden Routes upon initial glance has roads which have the lowest maximum speed in the area that make them appear unpromising. However, they seem to be the smoothest and least congested routes. The majority of these are located in the outskirts of Pasig City. In times of congestion or inclement weather, the MMDA and Waze should guide drivers to these locations.
When we grouped clusters this way, we were able to reveal the city's various congested locations. The number of jams, maximum speed, and normalized delay for each block were the three most relevant characteristics in the clusters. All three are traffic characteristics that may be used to explain traffic congestion during different times of the day. As a consequence, we were able to decipher the clustering findings and glean useful information from the data.
Finally, we want to recommend the following to the relevant stakeholders of Pasig City and the National Government. First, we suggest increasing infrastructure capacity especially in Cluster 1 (Major Highways) to help in decreasing overall congestion among these roads especially since these are main thoroughfares. In the short-term, rerouting some of the traffic volume in the Main Highways to less-traversed roads such as those found in the third cluster (the Hidden Routes) may help in alleviating the congestion within the city during the rush hour. Lastly, for Cluster 2 (Intracity Routes), we suggest taking a closer look at these areas for the crafting of long-term urban planning policies since points of interests such as commercial and public spaces are included here. In the process, this may improve the prospects of public transportation as well as route optimization and help in the overall traffic situation of Pasig in the future.
Recommendations to improve this study include expanding the dataset by integrating flood and accident reports, as well as possibly including non-jammed roads for a more complete picture of the traffic situation. This limitation could be augmented through twitter as well through NLP.
It is widely known but not quantified that rain and accompanying floods aggravate traffic. In fact, ADB dubbed Metro Manila as the most congested capital in 2019. Additionally, flooding in the region has been a persistent issue given the lack of proper infrastructure and preparation. Among these cities in the capital, Pasig City had some of the worst bottlenecks in the region according to the Metro Manila Development Authority (MMDA) and the continuous flooding incidents are known to aggravate the traffic.
Using the traffic data provided by the Waze Connected Citizens Program through Thinking Machines, and the weather data that was scraped from timeanddate.com and worldweatheronline.com, this report aims to determine how the traffic flow in different areas of Pasig City responds during rainy weather and to encourage more discussions on data-driven urban planning to assist LGUs in Disaster Risk Reduction & Traffic Management.
With the help of data from the Waze Connected Citizens Program and weather data, we try to answer the question: how does the traffic flow in different areas of Pasig City respond during rainy weather?
As Metro Manila continues to congest due to rising population albeit lacking infrastructure, our urban planners are constantly in pursuit of an optimal solution. As discussed earlier, quantitative methods in identifying and measuring the extent of the problem are lacking. As such, data-driven approaches to answer our congestion woes are also deficient.
Data on traffic and its derivatives are abundant in the digital age; although not optimally utilized. For example, the MMDA has placed several cameras along the capital’s major thoroughfares, but it has yet to fully maximize it other than monitoring bottlenecks. We anticipate that research such as this should increase awareness of the availability of similar data sets such as with Waze.
Lastly, the results from our study are not limited to urban planning. We also hope that our findings will better equip local government units (other than Pasig City) in disaster risk reduction and traffic management. Although urban planning solves congestion in the long run, we intend our study to benefit the current needs of any city.
Image('Project_Methodology.png')
For the weather features, we scraped the data from Time and Date API and World Weather Online website. For the traffic features and blocks, we obtained the data externally via the Waze Connected Citizens' Program and Thinking Machines. Both the traffic and weather datasets were cleaned accordingly.
We created 700 traffic blocks (from coordinates) to represent Pasig City's areas of traffic.
For the engineered traffic features, we mapped average speed, number of traffic jams, portions of road for each square block. Subsequently the engineered weather features generated included the amount of rainfall in a certain span of days/hours combined with the existing traffic features.
For our EDA, we examined the trends for traffic, weather, and a combination of both over certain periods of time. We also analyzed the relationship between traffic and weather.
For clustering, we performed Ward's Clustering so that we can better see how the smaller clusters were lumped together. We then labeled the clusters appropriately based on locations and based on the features. In the process, we also applied dimensionality reduction to our dataset via Singular Value Decomposition (SVD) to interpret the significant features.
For the weather data, we scraped WorldWeatherOnline and Timeanddate in hourly intervals. We limit the listed variables to the ones relevant to the study.
The group collected weather data from 2 different sources since each source had different features:
The WorldWeatherOnline publishes weather data, including the amount of precipitation, in hourly intervals. However, there is no data specific to Pasig City. Thus, the group collected data for Quezon City, the nearest city from Pasig City, from 2018 to August 2021. The group used an API and the base requests to collect the data. We then used BeautifulSoup to parse the results. We kept each month as a separate CSV file.
requests. BeautifulSoup was also used to parse the results. We kept each month as a separate CSV file. For the traffic data, the team requested historical Waze jams data from Thinking Machines Data Science (TM). As a Waze Connected Citizens Partner, TM has been collecting traffic data on the entire country since 2018
The query used to extract data from the TM BigQuery dataset is below:
We extracted three years of Pasig City jams as a 700MB compressed json.
| Column | Data Type | Content |
|---|---|---|
| precipmm_std | float | Adjusted amount of precipitation in mm |
| 'rain_classification_std' | int | Rain classification based on PAGASA’s definition. 0 if precipmm_std is at 0mm, 1 if greater than 0 but below 2.5mm, 2 if greater or equal to 2.5mm but less than 7.5mm, 3 if greater or equal to 7.5mm |
| desc | text | Description of weather (i.e., Haze, Partly sunny, Overcast, Passing clouds, Light rain, etc.) |
The extracted data is composed of 2.5M rows of jams with a temporal granularity of 2 minutes. The accompanying data dictionary is illustrated below:
Figure X. Waze data specifications
The following pre-processing steps were performed for the weather dataset.
To combine the weather datasets, we interpolated the data from Timeanddate into hourly intervals based on its previous value from the 6-hour interval. The Timeanddate and World Weather Online datasets were then merged on the hourly intervals.
We noted that the data on the amount of precipitation from the WorldWeatherOnline dataset was understated since its annual precipitation level is at around 1,000mm or below. According to Climatemps, average annual precipitation level for Metro Manila is at 2,061mm. Thus, we adjust the rainfall data by dividing the hourly precipitation level by the fraction of the amount of rainfall for the year over 2,061.
We added the rain classification per hourly interval based on PAGASA’s definition.
pd.read_csv('df_weather.csv').head()
For data cleaning, we converted the timestamps from UTC to local and removed rows with speed=0 and delay=-1 because those signified road closures. Road closures persist for days to months at a time and keeping them would artificially inflate the number and total length of jams.
The jam location was also restructured from a list of latitude-longitude dictionaries to a Linestring WKT.

Figure X. A list of latitude-longitude dictionaries
Figure X. Linestring WKT
Waze data comes in the form of a linestring with information attached to it. These linestrings range from tens to thousands of meters and are thus not directly comparable. We need to get them into a form that’s more consistent.
We decide on using square blocks 0.001 decimal degrees or ~170m per side. Doing a spatial join is a computationally expensive and slow process, so we hack a simpler solution.
We convert the linestrings into a list of square blocks by doing the following:
pd.explode on the resulting list.

Feature Engineering is the process of extracting features from raw data to include in the model. [1] We performed feature extraction on the Waze dataset and the scraped weather dataset. From there, we performed feature engineering to combine the two expanded datasets to produce a total of 2059 columns.
For each block representing a row, the features extracted included column-wise statistics such as the minimum, maximum, average, standard deviation of the jam speed. The count of jam speeds and unique road types were also features added. The mean and sum of delays were also calculated.
Interactions between other original features were included as well such as the mean jam speed multiplied by the mean delay determined to be similar to the length of the jam. For spatial features, the average and maximum of delay per jam length were also calculated as new features.
Next, these features were again further calculated temporally with varying granularity. These include month of the year, day of the week, hour of the week and if the day of occurrence was a weekday.
We get 1927 new columns from this extraction alone.
pd.read_csv('features_no_weather.csv.gz').head()
The feature engineering performed on the weather dataset mainly focused on precipitation. It was assumed to be the feature with the most effect on traffic flow.
Precipitation was first mapped to categories for interpretability. The four categories were defined by ranges of values and were named dry, drizzle, rain, heavyrain. Like the previous dataset, features were also calculated temporally with 3-hour and 6-hour buckets accordingly. These were then attached to the engineered features from the Waze dataset before temporal calculations.
pd.read_csv('features_weather_new.csv.gz').head()
# Build the working sample: 50k jam rows joined with the daily weather.
df_waze = waze_dataset(50000)
merged_df = merge_waze_weather(df_waze)
The heatmap shows the traffic level by hour for the top 25 most common roads in Pasig City. It can be observed that roads like Bridgewater, Elpidio Santos, Francisco Soriano Street, M. Suarez Avenue and Pag-asa Street have an intense traffic level for 24 hours while the remaining roads like Ortigas Avenue Extension, C. Raymundo Avenue, and Reverend Dumandan Street have a light to moderate traffic level. On average, the intense traffic starts from 8pm and lessens by 5am the next day which is also evident in the Ave. Traffic Level per Hour graph but might not be true to some roads.
hourly_heatmap(merged_df)
The average traffic levels between weekdays and weekends differ greatly in the morning. However, both are relatively similar in the afternoon until the early evening. Weekend traffic tends to be worse in the afternoon. This suggests that leisure or household activity is high on the weekend afternoons. These activities are normally missed on weekdays, such as going to the mall or church, running errands, or trips out of town.
# EDA plots: hourly traffic, rainfall histograms, rain classification
# frequencies, and the rainfall calendar.
plot_traffic()
plot_hist_rain()
plot_rain_classification()
plot_calendar_rain()
The scatter plot shows the roads in Pasig City that experienced traffic congestion during normal rain. These are some of the roads that are affected by Level 4 Traffic or intense traffic: Pasig Boulevard, C. Raymundo Avenue, Danny Floro Street, Pearl Drive, Exchange Road, Ortigas Avenue and Ortigas-C5 Flyover. While these are the roads that are affected by Level 1 Traffic or light traffic: C. Raymundo Avenue corner Glorietta Street and Pasig Boulevard near Kawilihan Lane.
geo_plot(merged_df, 'Scatter')
The heatmap shows the areas of Pasig City that have traffic congestion during normal rain. Some notable areas affected by high traffic congestion are Ortigas, San Antonio, Kapitolyo, Bagong Ilog, Maybunga and Santa Lucia.
geo_plot(merged_df, 'Heatmap')
The map below shows the areas in Pasig City that experience changes in the speed of vehicles during rainy and dry weather. Some notable areas are Ortigas, San Antonio, and Kapitolyo which are similar with areas that have high traffic congestion. This means that the speed of vehicles is much lower in high traffic congested areas.
# Map of per-block change in mean jam speed between dry and rainy days.
mapper_dry_vs_rainy(feature_compare='jam_speed_mean',
                    feature_plot='change_due_to_rain')
# Load the city-block metadata and the engineered weather/traffic features.
blocks2 = pd.read_csv('blocks_df.csv')
df_weather = pd.read_csv('features_weather_new.csv.gz',
                         index_col='Unnamed: 0')

# Impute missing values.
all_features = df_weather.copy()
# Feature families where a missing value means "no jam was observed",
# so zero is the semantically correct fill.
# (Fix: the original list contained 'jam_level_mean' twice.)
zero_cols = ('num_jams', 'delay_mean', 'jam_speedxdelay',
             'jam_level_mean', 'delay_sum')
for col in all_features.columns:
    # str.startswith accepts a tuple of prefixes — replaces the
    # original any(...) generator with a single C-level call.
    if col.startswith(zero_cols):
        all_features[col] = all_features[col].fillna(0)
# Remaining gaps (speed-like features) default to each column's maximum.
all_features = all_features.fillna(all_features.max())
In this section, we discuss the clustering results of the study. Several clustering methods were explored such as K-Means, Agglomerative, and Ward's Method. While results across these methods were similar, the group decided to choose the Ward's method for easier visualization and interpretability of the clustering process.
Hierarchical clustering connects pairs of clusters until the hierarchy contains every data object. The number of clusters $k$ is not pre-defined but is set after analysis, depending on the dendrogram or hierarchy. Ward's method is an example of a hierarchical clustering method that prefers to merge the smaller of two pairs of clusters with evenly spaced centers. It has the following distance metric between two clusters, $A$ and $B$, with the objective of minimizing the growth of $\Delta$:
$$ \Delta(A, B) = \sum_{i \in A \cup B} \|x_i - m_{A \cup B}\|^2 - \sum_{i \in A} \|x_i - m_A\|^2 - \sum_{i \in B} \|x_i - m_B\|^2 $$

scaler = StandardScaler()
# Standardize all features (zero mean, unit variance), preserving the
# original column names and index in a DataFrame.
df_weather_scaled = pd.DataFrame(
    scaler.fit_transform(all_features),
    columns=all_features.columns,
    index=all_features.index,
)

# Build the Ward-method hierarchical clustering and show its dendrogram.
Z = linkage(df_weather_scaled, method='ward', optimal_ordering=True)
plot1(Z);
Based on the dendrogram above, the biggest delta gap is from 100 to 150. No merging of clusters happens in this interval, signifying a significant difference among clusters. Thus, we set the threshold in this range to arrive at 3 well-defined clusters.
To evaluate the results of the cluster, we plot it against the first 2 components of SVD. We see that the clusters are separated with minimal overlap, suggesting good results.
# Project the scaled features onto the first two SVD components so the
# cluster separation can be inspected visually.
svd = TruncatedSVD(n_components=2, random_state=716, n_iter=10)
df_weather_new = svd.fit_transform(df_weather_scaled.fillna(0))

# Cut the Ward dendrogram at distance 120, which (per the dendrogram
# analysis above) yields 3 clusters; store the labels per city block.
blocks2['cluster'] = fcluster(Z, t=120, criterion='distance')

# Clusters plotted over the first two singular components.
plt.scatter(df_weather_new[:, 0], df_weather_new[:, 1], c=blocks2.cluster);
We plot the city blocks based on its cluster for visualization. We clearly see patterns from the clustering.
# Geographic view: every city block at its (lon, lat), colored by cluster.
plt.scatter(blocks2.lon, blocks2.lat, s=12, alpha=1, c=blocks2.cluster)
fig = plt.gcf()
fig.set_size_inches(5.9, 9)
plt.axis('off');
Since we have 132 features, we rely on the principles of dimensionality reduction to aid in our interpretation of the clusters. We identify the features that contribute the most or have the highest weight against the first component of SVD. These features would explain the most variance across clusters. We list the top 50 features below.
Based on the results, the important features are the following:
These features relate to the amount of congestion based on total delay and number of jams, and traffic behavior based on maximum, average, and standard deviation of jam speed across weather patterns.
# Rank features by the magnitude of their weight on the first SVD
# component (largest first); these explain the most variance.
sv1_weights = np.abs(svd.components_[0])
descending = np.argsort(sv1_weights)[::-1]
featimp = df_weather_scaled.columns[descending].tolist()
featimp[:50]
### `delay_per_length_max` across clusters

While `delay_per_length_max` was not considered a top feature along SV1, it measures the maximum normalized delay. This allows us to understand how badly congested the areas can get, highlighting the need for better infrastructure capacity. We plot a histogram per cluster at a city block level below based on dry weather since this is the most prominent weather pattern.
We see that Cluster 3 has the lowest mean normalized delay, with most city blocks along very low levels of normalized delay. Thus, the worst congestion level along these areas may be deemed tolerable. On the other hand, most city blocks in Cluster 1 and 2 have high levels of normalized max delay, suggesting that a lot of areas can get extremely congested.
# Attach each block's cluster label to its raw engineered features,
# joining on the block coordinates against the feature index.
df_weather_merge = blocks2[['coords', 'cluster']].merge(
    df_weather, left_on='coords', right_index=True)
df_delay_per_length = df_weather_merge[['cluster',
                                        'delay_per_length_max_rain:dry']]

# One histogram of max normalized delay per cluster, with a dashed line
# at the cluster mean.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
for axis, label in zip(axes.flat, [1, 2, 3]):
    axis.set_title(f"Cluster {label}", fontsize=12)
    subset = df_delay_per_length.loc[
        df_delay_per_length.cluster == label,
        'delay_per_length_max_rain:dry']
    plot = sns.distplot(subset, kde=False, bins=20, color="orange",
                        hist_kws=dict(alpha=0.4), ax=axis)
    axis.axvline(x=subset.mean(), color='orange', linestyle='--')
    plot.set(xlim=(0, 25))
plt.show()
We list the top 50 features based on SV1 per cluster basis by obtaining the mean.
# Rebuild the label/feature join, then average every feature within each
# cluster; transpose so features are rows and clusters are columns.
df_weather_merge = blocks2[['coords', 'cluster']].merge(
    df_weather, left_on='coords', right_index=True)
df_weather_merge = df_weather_merge.groupby('cluster').mean().T
# Show the top-50 SV1 features, rounded for readability.
df_weather_merge.astype(float).round(2).loc[featimp[:50]]
These areas promise you the fastest route with highest maximum and average speed, but a lot will find themselves very delayed in times of congestion due to vehicular volume and its long stretch. There is a high risk, high reward tradeoff along these areas, highlighted by the very high standard deviation of jam speed. Thus, it may be recommendable for drivers to traverse these routes only when there is no congestion. The traffic behavior across different weather patterns remains consistent for the cluster. We list them as follows:
- Highest number of jams
- Highest standard deviation of jam speed
- Highest fastest jam speed
- Highest average jam speed
- Second highest max normalized delay
- Second highest slowest speed
Major highways fall under this cluster. Notable areas include Ortigas Ave. Extension, Marcos Highway, E. E. Rodriguez Jr. Ave, Amang Rodriguez Ave, and Ortigas Ave.
# Geographic footprint of Cluster 1 (purple): the major-highway blocks.
cluster1 = blocks2[blocks2.cluster == 1]
plt.scatter(cluster1.lon, cluster1.lat, s=12, alpha=1, color='purple')
plt.gcf().set_size_inches(5.9, 9)
plt.axis('off');
- Highest max normalized delay
- Second highest number of jams
- Second highest standard deviation
- Second highest fastest speed
- Second highest mean jam speed
- Second highest standard deviation of jam speed
- Lowest slowest jam speed
# Geographic footprint of Cluster 2 (green).
cluster2 = blocks2[blocks2.cluster == 2]
plt.scatter(cluster2.lon, cluster2.lat, s=12, alpha=1, color='green')
plt.gcf().set_size_inches(5.9, 9)
plt.axis('off');
These routes do not seem promising given lowest max and mean speed in the area, but these are your smoothest routes with least congestion. With the highest slowest speed, these routes are recommended for drivers who would rather move at a slower pace than find themselves stuck. Traffic behavior along this area is very predictable given lowest level of standard deviation of jam speed and max normalized delay. The traffic behavior across different weather patterns remains consistent for the cluster. We list them as follows:
- Lowest number of jams
- Lowest max normalized delay
- Lowest fastest speed
- Lowest mean speed
- Lowest standard deviation in jam speed
- Highest slowest speed
These lie mostly in the outskirts of Pasig City. Notable areas include Axis Road, Samama Napindan Road, Daang Manunuso, Ipil, B. Tatco.
# Geographic footprint of Cluster 3 (orange): mostly the city outskirts.
cluster3 = blocks2[blocks2.cluster == 3]
plt.scatter(cluster3.lon, cluster3.lat, s=12, alpha=1, color='orange')
plt.gcf().set_size_inches(5.9, 9)
plt.axis('off');
The initial methodology was to reduce the combined dataset with engineered features which had high dimensionality using Principal Component Analysis and apply some form of clustering after. Due to the imbalance of engineered features from each dataset, the traffic dataset dominated the resulting clusters and engineered features from weather were not present. Thus, the results were inconclusive and the methodology was revised to the one used now. These results were still included in the report under Appendix A.
The study entailed several steps from web scraping to vectorizing traffic data to performing feature engineering. With data vectorization, we were able to create a more consistent dataset which enabled us to perform the necessary analysis. Together with feature engineering, we were then able to extract more information that was not inherent in our raw data.
We then performed several clustering methods to interpret the data. These methods included K-Means and Agglomerative Methods, but we chose Ward’s Method for its easier visualization and interpretability of the clustering process.
The results showed three distinct clusters which were intuitive when plotted onto a map. To ease our interpretation of the clusters given 132 features, we evaluated the top features based on the first component of SVD. Below are the top features which we relied on to explain the difference across clusters:
These features relate to the amount of congestion based on total delay and number of jams, and traffic behavior based on maximum, average, and standard deviation of jam speed across weather patterns.
The clusters that emerged from the study were able to unveil the different areas of congestion in the city. Notably, one cluster consisted of routes that experienced the worst traffic jams despite its promise of faster accessibility across the city. We labeled these routes as the Main Highways because it included roads such as Ortigas Avenue. In addition, the clusters also pinpointed routes that were less likely to experience congestion but had the slowest speeds for traversing cars. We labeled these routes as Hidden Routes due to their lack of traffic jams and more consistent vehicular flow. Lastly, we had the Intracity Routes which experienced the worst congestion and lowest maximum speeds. These routes were typically around shopping and business districts in Ortigas Center.
Given our study’s results, we have three main recommendations. First, there should be significant priority in managing the traffic jams in Cluster 1, which we identified as the Main Highways. These routes are predictably the biggest thoroughfares which are C5 (E. Rodriguez Avenue) and Ortigas Avenue. We further recommend increasing infrastructure capacity to address the congestion.
Second, to considerably ease the burden on the Main Highways, we also recommend rerouting traffic to the less-utilized roads (Hidden Routes) during rush hour. These routes may have low maximum speeds compared to the other roads in the city, but these are less susceptible to choking and the eventual traffic jams.
Third, our largest cluster was the Intracity Routes which had the highest normalized delays among the clusters. We believe this is where long-term urban planning policies may provide more effective traffic management. The cluster tends to highlight several points of interest, such as shopping malls, business districts, and institutional facilities. There could be room for more effective public transport options or routes. We expect such policies could ease traffic volumes within the city.
This study could also have benefited from enriching the two datasets with actual records of flood reports from Waze alerts, but we were not able to successfully request the alerts table from Thinking Machines. Similar data and other details such as accidents could also be scraped from Twitter or news reports but would require some NLP.
Waze only records areas with jams, defined as moments when the average speed slows down to less than 80% of freeflow speed. To get a more accurate representation of ground truth, each segment of road that is not jammed must be filled in with the freeflow speed. However, this is algorithmically complex and would bump up our dataset to dozens of GB and is outside the scope of this subject.
The original methodology planned on using the dataset with 2059 total features. We first applied dimensionality reduction to determine the minimum number of features to explain at least 90% of the variance. 125 principal components were needed for this from the 2059 features, with the first principal component reaching about 45%. The initial dimensionality reduction technique used was Principal Component Analysis rather than Singular Value Decomposition. SVD is more suited for sparse data, and at this point the 2059 total features had about 10% zero values from imputing NaN values; we did not believe this was enough for the dataset to be considered a sparse matrix. For our current methodology, however, we decided that SVD would still be more appropriate for performing the dimensionality reduction. After this, we proceeded in using a clustering technique each from representative and hierarchical methods. K-means clustering was preferred with its internal validation checks and low cost of implementation. Plotted on the first and second PCA components, these were the clustering results.
The resulting clusters, when plotted on the map of Pasig, yielded a great visual representation of the clusters, which eventually led us to use this algorithm on the cluster results in our current methodology.
# Appendix A: cluster plot from the initial PCA + K-means methodology.
Image('Appendix_A1.png')
However, when exploratory data analysis was done on the clusters and the principal components, the results were inconclusive to our problem statement as most of the weather-related features were no longer present in the principal components. The clusters themselves were lacking in differences when interpreting points of interest on the map especially if the three clusters were close together. The principal components failing to yield weather data was our biggest crutch as we could not provide any means to relate weather and traffic. Thus, the need for revision in our methodology.
# Appendix A: map view of the initial (later revised) clustering results.
Image('Appendix_A2.png')